Commonly, machine learning algorithms can be built from four modules: data, a model, a learning objective, and an optimization procedure. Most of the time, these four modules can be freely combined with one another.
We structured this notebook with these four modules in mind.
In Section 0 we do some housekeeping: 0.1 imports the libraries that we use, 0.2 defines the hyperparameters of the notebook, and 0.3 loads the MNIST training data.
In Section 1 we deal with the first module, data. In Section 1.1 we split the data into training batches.
In Section 2 we define the VAMPprior model. We first introduce two layers that we will use as building blocks for our model: Section 2.1 describes the gated dense layer and Section 2.2 describes a plain linear layer. The linear layer $Wx+b$ could have been reused from tf.contrib, but we add it ourselves for clarity.
In Section 3 we define the third module, our learning objective. It consists of two parts: the reconstruction error between $x$ and its reconstruction, and the KL divergence between the prior $P(z)$ and the approximate posterior $Q(z|x)$. The prior $P(z)$ is parametrized, i.e. it is no longer a standard Gaussian but a mixture of Gaussians.
In Section 4 we describe the optimization procedure. We use Adam, a momentum-based variant of SGD.
In Section 5 we perform the actual training.
In [ ]:
import tensorflow as tf
import numpy as np
np.set_printoptions(precision=2)
import logging
import random
import os
from os.path import join as j
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
from IPython.display import Image # displaying images
from tensorflow.python.client import timeline
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import time
# load trainings data
from keras.datasets import mnist
import itertools
assert(tf.__version__=="1.2.0") # make sure to have the right tensorflow version
In [ ]:
(train_features, _), (test_features, _) = mnist.load_data() # shape = [60000, 28,28] and [10000, 28,28]
features = np.concatenate((train_features,test_features), axis = 0) # shape = [70000, 28,28]
features = features.astype('float32')
features = features / 255.0 # normalize pixel values to the range [0, 1]
features = features.reshape(-1, 28*28) # => [70000, 784]
num_examples = len(features)# 70000
x = features # our input data
In [ ]:
# set random seed for reproducibility
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
graph_outdir = "graph-vamp"
model_name ="nn-vamp"
learning_rate = 0.001
tf_beta = tf.Variable(0.01, trainable=False, dtype=tf.float32)
x_dim = 28*28 # number of MNIST pixels per image
# encoder
enc_l1_dim = 300
enc_l2_dim = 300
z_dim = 40
# decoder
dec_l1_dim = 300
dec_l2_dim = 300
tf_global_step = tf.Variable(0, trainable=False)
# VAMPPrior
number_of_components = 500 # K in paper
# training
batch_size = 500
num_epochs = 2000
num_batches_per_epoch = int(num_examples / batch_size)
num_training_steps = num_batches_per_epoch * num_epochs
trainings_iterator = list(iter(zip(
list(range(1, num_batches_per_epoch*num_epochs+1)), # current_step
list(range(1, num_batches_per_epoch+1))*num_epochs, # current_batch
list( # [0000 ... 1111 ... 2222 ...]
itertools.chain(*[[e]*num_batches_per_epoch for e in range(1,num_epochs+1)])
) # current_epoch
)))
save_checkpoint_after_each_step = num_batches_per_epoch
print_loss_after_each_step = num_batches_per_epoch // 10 # print 10 status updates per epoch
In [ ]:
# split the data into training batches
print("Total training steps: %i" % len(trainings_iterator))
print("Preparing %i batches..." % num_batches_per_epoch)
data = {}
for batch_id, start_index in enumerate(range(0,num_examples, batch_size)):
    data[batch_id+1] = x[start_index:start_index+batch_size] # [batch_size, 784]
print("done")
In [ ]:
def gated_dense_layer(
layer_input, # x [batch_size, input_dimension]
in_dim, # [input_dimension]
out_dim, # [output_dimension]
name,
activation_function = tf.nn.sigmoid, # sigma in paper
dtype=tf.float32,
weight_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED)):
with tf.variable_scope(name):
# gate - A in paper
gate_weights = tf.get_variable(
shape=[in_dim, out_dim],
dtype=tf.float32,
name="%s_gate_weights"%name,
initializer=weight_initializer)
gate_bias = tf.get_variable(shape=[out_dim], name="%s_gate_bias"%name)
gate_linear = tf.nn.bias_add(value=tf.matmul(layer_input,gate_weights),bias=gate_bias)
# normal dense - B in paper
dense_weights = tf.get_variable(
shape=[in_dim, out_dim],
dtype=tf.float32,
name="%s_dense_weights"%name,
initializer=weight_initializer)
dense_bias = tf.get_variable(shape=[out_dim], name="%s_dense_bias"%name)
dense_linear = tf.nn.bias_add(value=tf.matmul(layer_input,dense_weights),bias=dense_bias)
dense_activated = activation_function(dense_linear)
layer_outputs = tf.multiply(x=gate_linear, y=dense_activated, name="%s_outputs"%name) # H_0 = A * sigma(B)
return layer_outputs # H_0
In [ ]:
def linear_layer(
layer_input,
in_dim, # [input_dimension]
out_dim, # [output_dimension]
name,
weight_initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED)):
with tf.variable_scope(name):
        # weights and bias of the linear transformation Wx + b
weights = tf.get_variable(
shape=[in_dim, out_dim],
dtype=tf.float32,
name="%s_weights"%name,
initializer=weight_initializer)
bias = tf.get_variable(shape=[out_dim], name="%s_bias"%name)
linear = tf.nn.bias_add(value=tf.matmul(layer_input,weights),bias=bias)
return linear
In the following two figures, red means that gradients cannot flow back because of the stochasticity of the nodes. If we sampled from the distribution $\mathcal{N}(z;\mu, \Sigma)$ directly, we could not backpropagate.
In contrast, after applying the reparametrization trick $z=\mu + \epsilon * \sqrt{\Sigma}$ with $\epsilon \sim \mathcal{N}(0,1)$, we can backpropagate to $\Sigma$ and $\mu$. We still cannot backpropagate to $\epsilon$, but we also don't need to.
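As a quick standalone check (a NumPy sketch with made-up numbers, separate from the TensorFlow graph defined below): $z=\mu + \epsilon * \sqrt{\Sigma}$ with $\epsilon \sim \mathcal{N}(0,1)$ has the same distribution as a direct sample from $\mathcal{N}(\mu, \Sigma)$, while $\mu$ and $\Sigma$ enter only through deterministic operations.
In [ ]:
import numpy as np
# NumPy sketch of the reparametrization trick (illustration only, not used by the model)
rng = np.random.RandomState(3)
mu, var = 1.5, 0.49                               # made-up mean and variance
eps = rng.randn(100000)                           # epsilon ~ N(0, 1)
z_reparam = mu + eps * np.sqrt(var)               # deterministic function of mu and var
z_direct = rng.normal(mu, np.sqrt(var), 100000)   # sampling N(mu, var) directly
print(z_reparam.mean(), z_reparam.std())          # ~1.5, ~0.7
print(z_direct.mean(), z_direct.std())            # ~1.5, ~0.7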
In [ ]:
def variational_encoder(x):
# encoder # q(z | x)
with tf.variable_scope("VariationalEncoder"):
enc_h1 = gated_dense_layer(layer_input=x, in_dim=x_dim, out_dim=enc_l1_dim, name="Layer1") # [batch_size, 300]
enc_h2 = gated_dense_layer(layer_input=enc_h1, in_dim=enc_l1_dim, out_dim=enc_l2_dim, name="Layer2") # [batch_size, 300]
z_q_mean = linear_layer(enc_h2, in_dim=enc_l2_dim, out_dim=z_dim, name="z_mean") # [batch_size, z_dim]
z_q_logvar = linear_layer(enc_h2, in_dim=enc_l2_dim, out_dim=z_dim, name="z_logvar") # [batch_size, z_dim]
return z_q_mean,z_q_logvar
In [ ]:
def reparametrize(z_q_mean, z_q_logvar):
# https://arxiv.org/pdf/1312.6114.pdf, Page 5 in the very bottom. # [bs, z_size]
with tf.name_scope("Repararmetrization_Trick"): # this is the reparameterization trick
# z_sigma = z_q_logvar
# z_mu = z_q_mean
epsilon = tf.random_normal( # draw some uniform random noise
tf.shape(z_q_logvar),
mean=0.0,
stddev=1.0,
dtype=tf.float32,
name="random_variable_epsilon")
std = tf.sqrt(tf.exp(z_q_logvar)) # e^(1/2 * ln(var)) = sqrt(var)
z_q = z_q_mean + epsilon * std # [batch_size, z_size]
return z_q # z_q means z~Q(z|x). In words: that z was sampled from the approximate posterior Q(z|x)
In [ ]:
def variational_decoder(z_q):
# decoder - p(x|z)
with tf.variable_scope("VariationalDecoder"): # x_mean = p(x|z)
dec_h1 = gated_dense_layer(layer_input=z_q, in_dim=z_dim, out_dim=dec_l1_dim, name="DecLayer1") # [batch_size, 300]
dec_h2 = gated_dense_layer(layer_input=dec_h1, in_dim=dec_l1_dim, out_dim=dec_l2_dim, name="DecLayer2") # [batch_size, 300]
x_mean = linear_layer(dec_h2, in_dim=dec_l2_dim, out_dim=x_dim, name="x_mean") # [batch_size, 784]
        # x_mean contains the logits of p(x|z), i.e. the reconstruction of x from the sampled z
        # we want to maximize the probability that this reconstruction looks like the original x
return x_mean
In [ ]:
# input
with tf.variable_scope("Input_features_%s"%x_dim):
x_input = tf.placeholder(shape=[None, x_dim], dtype=tf.float32, name="input_features")# shape: [batch_size, x_dim]
with tf.variable_scope("Input_VAMPprior"):
# prior: p(z) = 1/K sum_k N(mean_k, var_k)
# mixture of Gaussians parameters
component_means = tf.get_variable( # u in paper
shape=[number_of_components, x_dim], #[500, 784]
dtype=tf.float32,
name="vamp_prior_weights",
initializer=tf.contrib.layers.xavier_initializer(seed=RANDOM_SEED))
with tf.variable_scope("VAMPModel") as scope:
# encoder
z_q_mean,z_q_logvar = variational_encoder(x=x_input)
# reparametrize
z_q = reparametrize(z_q_mean=z_q_mean, z_q_logvar=z_q_logvar)
# vamp_prior
scope.reuse_variables() # share parameters for the encoder
z_p_mean,z_p_logvar = variational_encoder(x=component_means) #number_components x z_dim
print("z_p_mean", z_p_mean.shape)
print("z_p_logvar", z_p_logvar.shape)
# decoder
x_mean = variational_decoder(z_q)
Our objective consists of two parts: the reconstruction error $RE$ between $x$ and $x_{reconstructed}$, and the KL divergence between the two probability distributions $Q(z|x)$ (the approximate posterior) and $P(z)$ (the prior).
The reconstruction error $RE$ is the sigmoid cross entropy between the decoder logits and $x$, which is exactly the negative Bernoulli log-likelihood of $x$ under the reconstruction.
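As a quick sanity check of this claim (a NumPy sketch with made-up numbers, not part of the training graph): computing the cross entropy by hand, using the same numerically stable formulation that tf.nn.sigmoid_cross_entropy_with_logits documents, gives exactly the negative Bernoulli log-likelihood $-[x \ln(p) + (1-x)\ln(1-p)]$ with $p = sigmoid(logits)$.
In [ ]:
import numpy as np
# NumPy sketch (illustration only): sigmoid cross entropy on logits
# equals the negative Bernoulli log-likelihood.
logits = np.array([-2.0, 0.5, 3.0])              # made-up decoder logits
targets = np.array([0.0, 1.0, 1.0])              # made-up binary pixels
p = 1.0 / (1.0 + np.exp(-logits))                # p = sigmoid(logits)
neg_log_likelihood = -(targets*np.log(p) + (1-targets)*np.log(1-p))
# numerically stable form: max(l, 0) - l*t + log(1 + exp(-|l|))
cross_entropy = np.maximum(logits, 0) - logits*targets + np.log1p(np.exp(-np.abs(logits)))
print(neg_log_likelihood)
print(cross_entropy)                             # identical up to floating point error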
For the KL divergence, our approximate posterior $Q(z|x)$ is a multivariate normal distribution with diagonal covariance, and our prior $P(z)$ is a mixture of such diagonal Gaussians (the VAMPprior). To be able to calculate the KL divergence, we first need the natural logarithm $\ln$ of the two probability density functions.
The following derivation gives the natural logarithm of a multivariate normal density with diagonal covariance, $\ln(\mathcal{N}_{diagonal}(z\,;\,\mu,\Sigma))$.
The probability density function of a multivariate Gaussian distribution is:
$$ \mathcal{N}(z\,;\,\mu,\Sigma) = \frac{1}{(2\pi)^{n/2} \sqrt{\det(\Sigma)}} \exp\Big(-\frac{1}{2} (z-\mu)^T\,\Sigma^{-1}(z-\mu)\Big) $$
Since $\Sigma$ is a diagonal matrix, this expression simplifies in two places. First, $\Sigma^{-1}$ is again diagonal with entries $\frac{1}{\sigma_i^2}$, so the quadratic form inside the $\exp()$ becomes a simple sum over the $n$ dimensions:
$$ (z-\mu)^T\,\Sigma^{-1}(z-\mu) = \sum_{i=1}^n \frac{1}{\sigma_i^2}(z_i-\mu_i)^2 $$
Second, the determinant of a diagonal matrix is simply the product of its diagonal elements, which allows us to write:
$$ \frac{1}{\sqrt{\det(\Sigma)}} = \Big(\prod_{i=1}^n \sigma_i^2\Big)^{-1/2} $$
Taking the logarithm $\ln$ and plugging in these simplifications, we end up with:
$$ \ln(\mathcal{N}_{diagonal}) = \ln\Big(\frac{1}{(2\pi)^{n/2}}\Big) + \ln\Big(\Big(\prod_{i=1}^n \sigma_i^2\Big)^{-1/2}\Big) + \ln\Big(\exp\Big(-\frac{1}{2} \sum_{i=1}^n \frac{1}{\sigma_i^2}(z_i-\mu_i)^2\Big)\Big) $$
We drop the first term $\ln(\frac{1}{(2\pi)^{n/2}})$ because it cancels out in the KL divergence calculation below. Evaluating the remaining logarithms, we end up with:
$$ \ln(\mathcal{N}_{diagonal}) = - \frac{1}{2} \sum_{i=1}^n \ln(\sigma_i^2) - \frac{1}{2} \sum_{i=1}^n \frac{1}{\sigma_i^2}(z_i-\mu_i)^2 $$
Joining the two sums, we end up with:
$$ \ln(\mathcal{N}_{diagonal}) = - \frac{1}{2} \sum_{i=1}^n \Big( \ln(\sigma_i^2) + \frac{1}{\sigma_i^2}(z_i-\mu_i)^2 \Big) $$
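A small NumPy check of this result (illustration only, with made-up numbers): adding back the dropped constant $-\frac{n}{2}\ln(2\pi)$, the expression above matches the sum of the exact one-dimensional Gaussian log-densities.
In [ ]:
import numpy as np
# NumPy sketch: verify ln(N_diagonal) against the sum of 1-D Gaussian log-densities.
# The constant -n/2 * ln(2*pi) is added back because the derivation above drops it.
rng = np.random.RandomState(1)
n = 5
z, mu = rng.randn(n), rng.randn(n)                # made-up z and mean
log_var = rng.randn(n)                            # made-up ln(sigma_i^2)
var = np.exp(log_var)
ours = -0.5*np.sum(log_var + (z-mu)**2/var) - 0.5*n*np.log(2*np.pi)
exact = np.sum(-0.5*np.log(2*np.pi*var) - 0.5*(z-mu)**2/var)
print(ours, exact)                                # equal up to floating point error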
In [ ]:
# approximate posterior q(z|x)
def ln_diagonal_normal(z, mean, log_sigma, axis=1): # returns shape [batch_size]
    # our encoder outputs the log-variance, so tf.exp(log_sigma) gives the variance sigma^2
    log_q_normal = -0.5 * ( log_sigma + tf.pow(z-mean,2) * tf.pow(tf.exp(log_sigma),-1) )
    return tf.reduce_sum( log_q_normal, axis=axis ) # sum over the z dimensions
Here our $P(z)$ is a mixture of multivariate Gaussians with diagonal covariance,
$$ P(z) = \frac{1}{K} \sum_{k=1}^K \mathcal{N}(z\,;\,\mu_k, \Sigma_k), $$
where $K$ is the total number of components ('number_of_components') and $\mu_k$ and $\Sigma_k$ are the mean and covariance matrix of component $k$. As with the VAE, each covariance matrix is diagonal, i.e. a vector of size 'z_dim'. Below, $i$ indexes the examples of the batch.
To evaluate every component for every example in one pass, we tile $z$ and the component parameters so that each row pairs one $z_i$ with one component:
z_1 [mu_k=1, sigma_k=1] # first z for all K components
z_1 [mu_k=2, sigma_k=2]
..
z_1 [mu_k=K, sigma_k=K]
z_2 [mu_k=1, sigma_k=1] # second z for all K components
..
z_n [mu_k=K, sigma_k=K] # n^th z for all K components
Then we calculate the log of the diagonal multivariate Gaussian density $\ln(\mathcal{N}(z_i;\mu_k, \Sigma_k))$ for every row, i.e. for all $i$ and $k$. After reshaping, 'ln_components_per_example' holds all $K$ component log-densities of one example in one row.
Next we calculate the log-sum-exp for each example of the batch,
$$ \ln \Big(\sum_{k=1}^K \exp\big(\ln(\mathcal{N}(z_i;\mu_k, \Sigma_k))\big)\Big), $$
using the numerically stable log-sum-exp trick (see https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/).
Finally, we subtract $\ln(K)$ from each row, because the Gaussian mixture carries the factor $\frac{1}{K}$ in front of the sum, and $\ln(\frac{1}{K}\sum_k \dots) = \ln(\sum_k \dots) - \ln(K)$.
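A minimal NumPy sketch (made-up numbers, illustration only) of why the log-sum-exp trick matters here: the component log-densities $a_k = \ln(\mathcal{N}(z_i;\mu_k, \Sigma_k))$ are large negative numbers, so exponentiating them directly underflows to zero, while the shifted form $m + \ln \sum_k \exp(a_k - m)$ with $m = \max_k a_k$ stays finite (tf.reduce_logsumexp applies this kind of shift internally).
In [ ]:
import numpy as np
# NumPy sketch of the log-sum-exp trick (illustration only)
ln_components = np.array([-1200.0, -1195.0, -1210.0])   # made-up ln N(z; mu_k, Sigma_k) values
naive = np.log(np.sum(np.exp(ln_components)))            # exp() underflows to 0 -> -inf
m = np.max(ln_components)
stable = m + np.log(np.sum(np.exp(ln_components - m)))   # finite, ~ -1194.99
print(naive, stable)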
In [ ]:
# the log loss of the vamp prior
def ln_vampprior(
z,# [batch_size,z_dim]
mean, # [number_of_components, z_dim ]
log_sigma): # [number_of_components, z_dim ]
    # for every z in the batch, calculate ln_diagonal_normal against every component
# reshape z to [batch_size*num_components, z_dim]
z_tiled = tf.reshape(tf.tile(z, [1, number_of_components]), [-1, z_dim])
# reshape mu and log_sigma to [batch_size*num_components, z_dim]
mean_tiled = tf.reshape(tf.tile(mean, [batch_size,1]), [-1, z_dim])
log_sigma_tiled = tf.reshape(tf.tile(log_sigma, [batch_size,1]), [-1, z_dim])
# calculate ln(N) for each component
    ln_components = ln_diagonal_normal( z_tiled, mean_tiled, log_sigma_tiled, axis=1 ) # => [batch_size*num_components]
ln_components_per_example = tf.reshape(ln_components, [-1, number_of_components]) # [batch_size, number_of_components]
# calculate log_sum_exp for each ln_components_per_example,
ln_sum_per_example = tf.reduce_logsumexp( ln_components_per_example , axis=1 )
ln_p_z = ln_sum_per_example - tf.log(tf.cast(number_of_components,tf.float32))
return ln_p_z
This function calculates the KL divergence between two probability distributions. For details on the KL divergence see "Pattern Recognition and Machine Learning" (PRML), C. M. Bishop, 2006.
There, Equation (1.35) in Section 1.2.2 states that, given a finite number $N$ of points drawn from a probability distribution or probability density, the expectation of a function $f$ can be approximated by a finite sum over these points (for both continuous and discrete distributions):
$$ \mathbb{E}[f] \simeq \frac{1}{N} \sum_{n=1}^N f(x_n) $$
Then, in Section 1.6.1, Equation (1.113) gives the KL divergence:
$$ KL( Q(z|x) \,||\, P(z) ) = - \int q(z)\, \ln\Big(\frac{p(z)}{q(z)}\Big)\,dz $$
Evaluating the logarithm of the fraction ($\ln(\frac{a}{b})=\ln(a)-\ln(b)$), we end up with:
$$ KL( Q(z|x) \,||\, P(z) ) = - \int q(z)\, \big[\ln(p(z)) - \ln(q(z))\big] \,dz $$
Note that, compared to Bishop's notation, we have swapped the roles of $Q$ and $P$ and use $z$ as the random variable (instead of $x$). This can also be expressed as an expectation,
$$ KL( Q(z|x) \,||\, P(z) ) = - \mathbb{E}_{z \sim Q} \big[\ln(p(z)) - \ln(q(z))\big], $$
which we approximate with Equation (1.35), using the $z$ samples of the current batch (one sample per data point).
Note that the constant term $\ln(\frac{1}{(2\pi)^{n/2}})$ was dropped in both log-densities because it occurs in both probability distributions and cancels out.
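To make the single-sample Monte Carlo approximation concrete, here is a NumPy sketch (made-up numbers, illustration only) that estimates $KL(q\,||\,p)$ between two one-dimensional Gaussians by averaging $\ln(q(z)) - \ln(p(z))$ over samples $z \sim q$, and compares it with the closed-form KL between two Gaussians.
In [ ]:
import numpy as np
# NumPy sketch: Monte Carlo estimate of KL(q || p) for two 1-D Gaussians (illustration only)
rng = np.random.RandomState(2)
mu_q, sigma_q = 0.5, 0.8                          # made-up approximate posterior
mu_p, sigma_p = 0.0, 1.0                          # made-up prior
z = rng.normal(mu_q, sigma_q, 100000)             # z ~ q
ln_q = -0.5*np.log(2*np.pi*sigma_q**2) - 0.5*(z - mu_q)**2/sigma_q**2
ln_p = -0.5*np.log(2*np.pi*sigma_p**2) - 0.5*(z - mu_p)**2/sigma_p**2
mc_kl = np.mean(ln_q - ln_p)                      # = -E_q[ln p - ln q]
closed_form = np.log(sigma_p/sigma_q) + (sigma_q**2 + (mu_q - mu_p)**2)/(2*sigma_p**2) - 0.5
print(mc_kl, closed_form)                         # both close to 0.17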
In [ ]:
def kl_divergence(log_p, log_q): # [batch_size]
    return tf_beta * -1.0 * ( log_p - log_q ) # per-example KL estimate, scaled by the warm-up factor beta
In [ ]:
with tf.variable_scope("TraingsLoss"): # Reconstruction loss RE
# RE
# log_Bernoulli
# in jakups paper this is the negative log loss [batch_size, 1]
re_per_example = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=x_mean, labels=x_input), axis=1)
# KL
# VAMPprior p(z)
ln_p_z_per_example = ln_vampprior(z=z_q, mean=z_p_mean, log_sigma=z_p_logvar) # p_lambda_z in paper [batch_size, 1]
# approximate posterior q(z|x)
ln_q_z_per_example = ln_diagonal_normal(z=z_q, mean=z_q_mean, log_sigma=z_q_logvar) # [batch_size, 1]
kl_per_example = kl_divergence(log_p=ln_p_z_per_example, log_q=ln_q_z_per_example) # [batch_size, 1]
# total_loss
total_loss = tf.reduce_mean( re_per_example + kl_per_example) # batch_size
In [ ]:
with tf.variable_scope("OptimizationProcedure"):
# https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
with tf.variable_scope("ObtainGradients"):
params = tf.trainable_variables()
gradients = tf.gradients(
ys=total_loss,
xs=params,
)
        grads_and_vars = list(zip(gradients, params))
with tf.variable_scope("ApplyGradients"):
adam_optimizer = tf.train.AdamOptimizer (
learning_rate=learning_rate,
name='AdamGradientDescent'
)
training_step = adam_optimizer.apply_gradients(
grads_and_vars=grads_and_vars,
global_step=tf_global_step,
name="apply_gradients_op"
)
In [ ]:
if not os.path.exists('reconstruction/'):
os.makedirs('reconstruction/')
# plots samples in a square
def plot_reconstruction( samples, epoch, size_x=3, size_y=3, name="reconstruction"):
if not (size_x * size_y) == len(samples): #
l = min(len(samples),50)
size_x = int(np.sqrt(l))
size_y = int(np.sqrt(l))
samples = samples[0:(size_x*size_y)]
fig = plt.figure(figsize=(size_x, size_y))
gs = gridspec.GridSpec(size_x, size_y)
gs.update(wspace=0.05, hspace=0.05)
for i, sample in enumerate(samples):
ax = plt.subplot(gs[i])
plt.axis('off')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_aspect('equal')
plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
outfile= "reconstruction/%s_%0.4d.png"%(name, epoch)
plt.savefig(outfile)
plt.close(fig)
try: # only in ipython notebook
display(Image(filename=outfile))
except: pass
def plot_latent_space(session, current_step, save_checkpoint_after_each_step):
# generate images
nx = ny = 20
x_values = np.linspace(-3, 3, nx)
y_values = np.linspace(-3, 3, ny)
z_mu_grid = []
canvas = np.empty((28*ny, 28*nx))
for yi in x_values:
for xi in y_values:
z_mu_grid.append([xi, yi])
z_mu_grid = np.array(z_mu_grid)
sampled_images = session.run(
fetches=[
tf.nn.sigmoid(x_mean),
],
feed_dict={
z_q:z_mu_grid
},
)[0]
current = 0
for i, yi in enumerate(x_values):
for k, xi in enumerate(y_values):
sampled_image = sampled_images[current]
canvas[(nx-i-1)*28:(nx-i)*28, k*28:(k+1)*28] = sampled_image.reshape(28, 28)
current+=1
plt.figure(figsize=(8, 10))
Xi, Yi = np.meshgrid(x_values, y_values)
plt.imshow(canvas, origin="upper", cmap="gray")
plt.tight_layout()
    outfile = 'reconstruction/%04d.png' % int(current_step / save_checkpoint_after_each_step)
plt.savefig(outfile)
try: # only in ipython notebook
display(Image(filename=outfile))
except: pass
In [ ]:
tf.set_random_seed(RANDOM_SEED)
with tf.Session() as session:
logger.info("Random Seed: %0.3f"%session.run(tf.random_normal([1], mean=-1, stddev=4, seed=RANDOM_SEED))[0])
tf.summary.scalar("total_loss",tf.cast(total_loss, tf.float32)) # summary for loss
tf.summary.scalar("KL",tf.cast(tf.reduce_mean(kl_per_example), tf.float32)) # summary for loss
tf.summary.scalar("RE",tf.cast(tf.reduce_mean(re_per_example), tf.float32)) # summary for loss
all_summaries = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(graph_outdir, graph=session.graph)
session.run([
tf.local_variables_initializer(),
tf.global_variables_initializer(),
])
saver = tf.train.Saver(tf.global_variables()) # Saver
logger.info("Started training...")
#TODO move to graph
new_beta_ph = tf.placeholder(tf.float32, shape=())
update_beta = tf_beta.assign(new_beta_ph)
# debug output operations
mean_kl = tf.reduce_mean(kl_per_example)
mean_re = tf.reduce_mean(re_per_example)
mean_lnp = tf.reduce_mean(ln_p_z_per_example)
mean_lnq = tf.reduce_mean(ln_q_z_per_example)
    session.graph.finalize() # prevent new nodes from being added to the graph
st = time.time() # timing
for current_step, current_batch, current_epoch in trainings_iterator:
# warmup for KL divergence
new_beta=min((current_epoch-1)/100.0, 1.0)
session.run(update_beta, feed_dict={new_beta_ph:new_beta})
# get current batch
trainings_batch = data[current_batch]
        # run the training operation
_, tr_loss, tr_summaries, tr_kl, tr_re, tr_lnp, tr_lnq, tr_r, tr_k, tr_p,tr_q, x_reconstructed = session.run(
fetches=[
training_step,
total_loss,
all_summaries,
mean_kl,
mean_re,
mean_lnp,
mean_lnq,
re_per_example,
kl_per_example,
ln_p_z_per_example,
ln_q_z_per_example,
x_mean,
],
feed_dict={
x_input:trainings_batch
},
)
# write summaries and metadata info to graph
summary_writer.add_summary(tr_summaries, current_step)
# print training progress every now and then
if current_step % print_loss_after_each_step==0:
logger.info ("~StepT:%0.2fs Epoch %0.4d,Step:%i, Loss:%0.2f, KL:%0.2f, RE:%0.2f, ln(p):%0.2f, ln(q):%0.2f b:%0.2f"%(
(time.time()-st)/float(current_step),# current time per step
current_epoch, current_step,tr_loss, tr_kl, tr_re, tr_lnp, tr_lnq,new_beta)
)
# save checkpoints once in a while
if current_step % save_checkpoint_after_each_step==0:
logger.info ("Saving checkpoint %i"%(current_step / save_checkpoint_after_each_step))
saver.save(session, j(graph_outdir, model_name), global_step=tf_global_step)
logger.info ("Plotting Components %i"%(current_step / save_checkpoint_after_each_step))
components = session.run(component_means)
plot_reconstruction(components, current_epoch, name="components")
logger.info ("Plotting Reconstruction %i"%(current_step / save_checkpoint_after_each_step))
plot_reconstruction(x_reconstructed, current_epoch)
if z_dim == 2:
logger.info ("Plotting Latent Space %i"%(current_step / save_checkpoint_after_each_step))
plot_latent_space( session,current_step, save_checkpoint_after_each_step)
In [ ]: